******************************************
// This do-file prepares the election data so that it can be matched with the score data by name, surname, state and election year
******************************************

// use data downloaded from: https://electiondataarchive.org/

// Read the raw election file. At this point each row is one candidate;
// the rest of this do-file brings the data to the district level.
use "", clear

// Discard the columns that are not needed further on.
drop release rg pev2 vot2 vv2 ivv2 to2 cv2 cvs2 pv2 pvs2 cv1 cvs1

// We want a constituency-level dataset 1946 to 2014:
// keep elections from 1946 onwards, leaving out year 1959.
keep if yr>=1946 & yr!=1959

// Special codes in the vote share: -992 flags an uncontested race (share set
// to 1), -990 flags a missing value (row removed).
replace pvs=1 if pvs==-992
drop if pvs==-990

// Remove one specific candidate record.
drop if yr==1972 & cst_n=="rhode island 1" & can=="miska, walter j"

// Election date: first day of the election month.
gen ele = mdy(mn, 1, yr)
format ele %d

// Rank the candidates of each district-election by descending vote share.
// The ordering is stated inside the bysort itself, so the rank does not rely
// on the sort order left behind by a previous command. The candidate name is
// added as a tie-breaker: Stata orders tied observations randomly, so without
// it two candidates with the same share could swap ranks from run to run.
// NOTE(review): pvs appears to resolve to pvs1 through Stata's variable
// abbreviation (pvs2 has been dropped) - confirm.
g neg_pvs = -1*pvs
bysort cst ele (neg_pvs can): gen rank = _n

// Vote share of the winner (rank 1) and of the runner-up (rank 2).
g wvs = pvs1 if rank==1
g svs = pvs1 if rank==2

// Party indicators built from the party code pty.
// Codes counted as dem: 180, plus
//   183 minnesota democratic-farmer-labor (see wiki)
//   181 north dakota democratic-non partisan (see wiki)
// Code counted as rep: 583.
g dem = inlist(pty, 180, 181, 183)
g rep = pty==583

// For dem and rep: seat won (rank 1), second place (rank 2) and vote share.
foreach party in dem rep {
    g `party'_seat = `party'==1 & rank==1
    g `party'_2nd = `party'==1 & rank==2
    g `party'vs = pvs if `party'==1
}

// Any other party: only the seat indicator is filled in; the second-place
// flag and the vote share are created but left at 0 / missing.
g other_seat = rank==1 & dem==0 & rep==0
g other_2nd = 0
g othervs = .

// Name of the top-ranked candidate.
g name = can if rank==1

// Put state and constituency names in lower case without surrounding blanks.
rename sub state
replace state = trim(lower(state))
replace cst_n = trim(lower(cst_n))


// Collapse at district-election level:
// first non-missing value for descriptive fields and vote shares,
// sums for the seat and second-place indicators.
collapse (firstnm) yr mn cst_n pev1 demvs repvs wvs svs name state ///
         (sum) dem_seat rep_seat other_seat dem_2nd rep_2nd, ///
         by(cst ele)
		 
// Labels for the collapsed district-election dataset.
// (The year label previously started with a stray accented character.)

// Identification and timing
label variable yr "Election Year"
label variable mn  "Month"
label variable ele "Election Date"
label variable cst_n  "Constituency Name"
label variable cst  "Constituency Code" // not stable across elections
label variable state "State"
label variable pev1  "Number of Eligible Voters"

// Democratic outcomes
label variable dem_seat "Dem wins"
label variable demvs  "Dem Vote Share"
label variable dem_2nd "Dem 2nd"

// Republican outcomes
label variable rep_seat "Rep win"
label variable rep_2nd "Rep 2nd"
label variable repvs "Rep Vote Share"

// Other outcomes
label variable other_seat "Other wins"
label variable wvs "Winner Vote Share"
label variable svs "Runner-Up Vote Share"
label variable name "Name of elected"

// From here on dem / rep denote the seat indicators.
rename dem_seat dem
rename rep_seat rep


// Define Majorities
// demmaj is 1 by default and 0 for election years 1946, 1952, 1994-2004 and
// 2010-2014.
g demmaj = 1
replace demmaj = 0 if yr==1946
replace demmaj = 0 if yr==1952
replace demmaj = 0 if yr>=1994 & yr<=2004
replace demmaj = 0 if yr>=2010 & yr<=2014
label variable demmaj "Dem has national majority"

// maj: the district winner belongs to the majority party.
// Left missing when neither dem nor rep won the seat.
g maj = .
replace maj = 1 if dem==1 & demmaj==1
replace maj = 1 if rep==1 & demmaj==0
replace maj = 0 if dem==1 & demmaj==0
replace maj = 0 if rep==1 & demmaj==1
label variable maj "District belongs to majority"

// Define presidential alignments
// Each window runs from the inauguration year (inclusive) to the
// inauguration year of the successor (exclusive). The 1993 window now ends at
// 2001 like all the others; with the former bound of 2000 the election year
// 2000 was coded as having a non-Dem president.
g dempres = 0
replace dempres = 1 if yr>=1933 & yr<1953
replace dempres = 1 if yr>=1961 & yr<1969
replace dempres = 1 if yr>=1977 & yr<1981
replace dempres = 1 if yr>=1993 & yr<2001
replace dempres = 1 if yr>=2009 & yr<2017
label variable dempres "President is Dem"

// pres: the district winner is of the president's party.
// Left missing when neither dem nor rep won the seat.
g pres = .
replace pres = 1 if dem==1 & dempres==1
replace pres = 1 if rep==1 & dempres==0
replace pres = 0 if dem==1 & dempres==0
replace pres = 0 if rep==1 & dempres==1
label variable pres "District aligned with president"

/*
// Define Dem Vote share in deviation from annual mean
global yr 1946	1948	1950	1952	1954	1956	1958	1960	1962	1964	1966	1968	1970	1972	1974	1976	1978	1980	1982	1984	1986	1988	1990	1992	1994	1996	1998	2000	2002	2004	2006	2008	2010	2012	2014
g demvs_demean = .
foreach yr in $yr {
quietly: egen avg_demvs`yr' = mean(demvs) if yr==`yr'
quietly: replace demvs_demean = demvs - avg_demvs`yr' if yr==`yr'
drop avg_demvs`yr'
}


//Define Dem in deviation from annual avg (take away FE(t))
areg dem, absorb(yr)
predict dem_demean, res
label variable dem_demean "dem minus its annual avg"

// Define Dem Vote share in deviation from annual regional means
// Regions are defined according to the definition of bureau of econ analysis
g region=""
replace region="newengland" if state=="connecticut" | state=="maine" | state=="massachusetts" | state=="new hampshire" | state=="rhode island" | state=="vermont"
replace region="mideast" if   state=="delaware" | state=="maryland" | state=="new jersey" | state=="new york" | state=="pennsylvania"
replace region="lakes" if   state=="illinois" | state=="indiana" | state=="michigan" | state=="ohio" | state=="wisconsin"
replace region="plains"  if state=="iowa" | state=="kansas" | state=="minnesota" | state=="missouri" | state=="nebraska" | state=="north dakota" | state=="south dakota"
replace region="southeast" if   state=="alabama" | state=="arkansas" | state=="florida" | state=="georgia" | state=="kentucky" | state=="louisiana" | state=="mississippi" | state=="north carolina" | state=="south carolina" | state=="tennessee" | state=="virginia" | state=="west virginia"
replace region="southwest" if   state=="arizona" | state=="new mexico" | state=="oklahoma" | state=="texas"
replace region="rocky"  if state=="colorado" | state=="idaho" | state=="montana" | state=="utah" | state=="wyoming"
replace region="farwest" if  state=="alaska" | state=="california" | state=="hawaii" | state=="nevada" | state=="oregon" | state=="washington"
label variable region "BEA region"

egen region_yr=group(region yr)
areg demvs, absorb(region_yr)
predict demvs_demean_reg, res
label variable demvs_demean_reg "demvs minus its annual/region avg"

////Define Dem in deviation from annual/region avg (take away FE(tr))
areg dem, absorb(region_yr)
predict dem_demean_reg, res
label variable dem_demean_reg "dem minus its annual/region avg"
*/

// Define RDD variables
// The same set of variables is built for both parties (prefix d for dem,
// prefix r for rep): margin of victory, its square and cube, the margin
// interacted with the win indicator (plus square and cube), and a flag for
// races in which the party finished first or second.
foreach p in d r {
    local party = cond("`p'"=="d", "dem", "rep")
    // Winner: own share minus runner-up share.
    g `p'gap = `party'vs-svs if `party'==1
    // Second place: own share minus winner share (applied last, so it prevails).
    replace `p'gap = `party'vs-wvs if `party'_2nd==1
    g `p'gap2 = `p'gap*`p'gap
    g `p'gap3 = `p'gap2*`p'gap
    g `p'wgap = `p'gap*`party'
    g `p'wgap2 = `p'wgap*`p'wgap
    g `p'wgap3 = `p'wgap2*`p'wgap
    g `p'sample = 1 if `party'==1 | `party'_2nd==1
}

label variable dgap "MV: dem vs opponent"
label variable dwgap "dgap*dem"
label variable dsample "Dem 1st or 2nd"
label variable rgap "MV: rep vs opponent"
label variable rsample "rep 1st or 2nd"


// Majority margin: the dem margin in years with demmaj equal to 1,
// the rep margin in years with demmaj equal to 0.
g mgap = cond(demmaj==1, dgap, rgap) if inlist(demmaj, 0, 1)
g mgap2 = mgap*mgap
g mgap3 = mgap2*mgap
g mwgap = mgap*maj
g mwgap2 = mwgap*mwgap
g mwgap3 = mwgap2*mwgap
label variable mgap "MV: distance to maj"

// Some operations to make district names consistent across years
global states alabama	alaska	arizona	arkansas	california	colorado	connecticut	delaware	florida	georgia	hawaii	idaho	illinois	indiana	iowa	kansas	kentucky	louisiana	maine	maryland	massachusetts	michigan	minnesota	mississippi	missouri	montana	nebraska	nevada	"new hampshire"	"new jersey"	"new mexico"	"new york"	"north carolina"	"north dakota"	ohio	oklahoma	oregon	pennsylvania	"rhode island"	"south carolina"	"south dakota"	tennessee	texas	utah	vermont	virginia	washington	"west virginia"	wisconsin	wyoming

quietly {
    // Single-digit district numbers: write 1 instead of 01, for every listed state
    foreach st in $states {
        forvalues n = 1/9 {
            replace cst_n = "`st' `n'" if cst_n == "`st' 0`n'"
        }
    }

    // At-large electoral districts.
    // These states get district number 1 when the name is the bare state name.
    foreach st in alaska delaware montana vermont wyoming {
        replace cst_n = "`st' 1" if cst_n == "`st'" & state == "`st'"
    }
    // For the Dakotas the district number is removed between 1986 and 2006.
    foreach st in "north dakota" "south dakota" {
        replace cst_n = "`st'" if inrange(yr, 1986, 2006) & cst_n == "`st' 1"
    }
}

// XT SET
// The panel identifier is a numeric encoding of the harmonised constituency
// name. The label goes on the new encoded variable; it used to be applied to
// cst_n, which overwrote that variable's "Constituency Name" label.
encode cst_n, gen(cst_n_code)
label variable cst_n_code "Constituency Name Encoded"
// Observations are spaced two years apart.
xtset cst_n_code yr, delta(2)


// Lags and leads of the outcome, margin and sample variables.
// One list serves both directions; dsample comes last so that the new
// variables are created in the same order as before.
local series dem rep other_seat dem_2nd rep_2nd demmaj maj mgap ///
             dgap dgap2 dgap3 dwgap dwgap2 dwgap3 ///
             rgap rgap2 rgap3 rwgap rwgap2 rwgap3 ///
             dsample

// Lag variables (previous election of the panel)
foreach v of local series {
    g lag`v' = L.`v'
}

// FORWARD variables (to use in placebos)
foreach v of local series {
    g lead`v' = F.`v'
}

// Redistricting years (they end with 2)
// Covers every year ending in 2 from 1902 through 2092.
g redistrict = mod(yr, 10)==2 & inrange(yr, 1902, 2092)
label variable redistrict "Years with redistricting"

// Generate a Congress variable
// Every even election year from 1946 to 2014 maps to Congress (yr - 1786)/2,
// which gives 80 for 1946 and 114 for 2014. Any other year gets 0.
g congress = cond(inrange(yr, 1946, 2014) & mod(yr, 2)==0, (yr-1786)/2, 0)


// Define re-elected
// reelect is 1 when the party that wins the seat (dem or rep) also won it in
// the previous election and 0 when the dem or rep indicator changed.
// It stays missing when there is no previous election to compare with:
// Stata evaluates "dem != lagdem" as true when lagdem is missing, so without
// the explicit missing check the first observation of every district was
// coded as a change of party.
// Seats won by another party in both elections are also left missing.
g reelect = .
replace reelect = 1 if dem==1 & lagdem==1
replace reelect = 1 if rep==1 & lagrep==1
replace reelect = 0 if dem != lagdem & !missing(dem, lagdem)
replace reelect = 0 if rep != lagrep & !missing(rep, lagrep)
label variable reelect "1 if same party as last time wins"

// Generate subsamples
// sample: dem indicator, dem vote share and lagged dem margin all observed.
// placebo_sample: same, and the lead of the dem margin is observed too.
g sample = 1 if dem!=. & demvs!=. & lagdgap!=.
g placebo_sample = 1 if sample==1 & leaddgap!=.


//Several samples with different yr balances
// The parentheses only spell out how the conditions are already grouped
// (& is evaluated before |).
g sample11 = 1 if (lagdemmaj==0) | (yr==1984)
g sample22 = 1 if (lagdemmaj==0 & yr>=1954) | (lagdemmaj==1 & yr>=2008)
g sample33 = 1 if (lagdemmaj==0 & yr>=1996) | (lagdemmaj==1 & yr>=1994)
g sample44 = 1 if (lagdemmaj==0 & yr>=1998) | (lagdemmaj==1 & yr>=1990)
g sample55 = 1 if (lagdemmaj==0 & yr>=2000) | (lagdemmaj==1 & yr>=1988)
g sample66 = 1 if (lagdemmaj==0 & yr>=2004) | (lagdemmaj==1 & yr>=1986)
g sample77 = 1 if (lagdemmaj==0 & yr>=2006) | (lagdemmaj==1 & yr>=1984)
g sample88 = 1 if (yr==1998) | (lagdemmaj==1 & yr>=1980)

// Labels follow the pattern 11.1%, 22.2%, ... 88.8%
forvalues k = 1/8 {
    label variable sample`k'`k' "`k'`k'.`k'% dem years"
}


//Other samples with different yr balances
// Each window ends in 2014; the start year moves back and the label counts
// two more dem years at every step (6, 8, ... 18).
local starts 1986 1980 1976 1970 1966 1960 1956
local i = 0
foreach start of local starts {
    local ++i
    local demyrs = 4 + 2*`i'
    g zample`i' = 1 if inrange(yr, `start', 2014)
    label variable zample`i' "6 rep yrs + `demyrs' dem yrs"
}


// Names and surnames
// Splits the winner's name into surname and firstname, then builds
// surname/state/congress keys. Two layouts are handled:
//   - names containing a comma: the part before the first comma is the
//     surname, the remainder is the first name;
//   - names without a comma: the last blank-separated word (out of at most
//     four) is the surname, the preceding words are the first name.
g name_dataset=name // keep it there for the record

replace name=trim(name)
replace name=lower(name)
// comma = 1 when the name contains a comma, 0 otherwise
g comma=1 if strpos(name, ",")
replace comma=0 if comma!=1
g surname=""
// Comma layout: split once at the comma into name1 (surname) and name2 (rest)
split name if comma==1 , p(,) limit(2)
replace surname = name1 if comma==1
g firstname = trim(name2) if comma==1
drop name1 name2

// No-comma layout: remove " jr." / " jr" before splitting into words
replace name=subinstr(name," jr."," ",.) if comma==0
replace name=subinstr(name," jr"," ",.) if comma==0
replace name = trim(name)

// NOTE(review): the code below assumes this split produces exactly
// name1-name4; a name with more than four words would create extra
// variables that are neither used nor dropped - verify against the data.
split name if comma!=1, p(" ")
foreach name in 1 2 3 4  {
replace name`name'=trim(name`name')
}


// Surname = last non-empty word (later replaces override earlier ones)
replace surname=name4 if name4!="" & comma!=1
replace surname=name3 if name3!="" & name4=="" & comma!=1
replace surname=name2 if name2!="" & name3=="" & comma!=1
replace surname=name1 if name1!="" & name2=="" & comma!=1
replace surname=trim(surname)

// First name = the words preceding the surname; the later, shorter
// assignments overwrite the earlier ones whenever their condition holds
replace firstname=name1+" "+name2+" "+name3 if surname==name4 & name4!="" & comma!=1
replace firstname=name1+" "+name2 if surname==name3 & name3!="" & comma!=1
replace firstname=name1 if surname==name2 & name2!="" & comma!=1

drop name1 name2 name3 name4 

replace surname=lower(surname)
replace firstname=trim(firstname)
replace firstname=lower(firstname)

// Manual surname correction for new york names whose first piece is "nydia".
// NOTE(review): the parse string is empty; presumably split then falls back
// to splitting on blanks - confirm. The drop assumes exactly six pieces.
split name, p("")
replace surname="velazquez" if name1=="nydia" & state=="new york"
drop name1 name2 name3 name4 name5 name6

// Key: surname + state + congress
g surname_state_congress = surname+"_"+state+"_"+string(congress)
sort surname_state_congress
// omonimo = 1 when the same key appears on an adjacent row after sorting,
// i.e. when the key is not unique
g omonimo=1 if surname_state_congress==surname_state_congress[_n-1] | surname_state_congress==surname_state_congress[_n+1]
// The first name is kept only where it is needed to tell such rows apart
replace firstname="" if omonimo!=1
// NOTE(review): "first" presumably resolves to firstname through variable
// abbreviation - confirm no other variable starts with "first".
g surname_state_congress_first = surname_state_congress+"_"+first

label variable surname "Surname (from name)"
label variable firstname "Firstname (from name)"



*save for further use
* NOTE(review): the file path is empty here - fill in the output path before running.
save "", replace

